###############################################################################
# This script cleans the referendums file for subsequent analysis

###############################################################################
# Start from an empty workspace: this removes every object in the calling
# environment before the data are loaded.
rm(list = ls())

# Main source of data (hand collected from PDFs).
df <- read.csv(
  file = "Data/Raw Data/CA_schools_final.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Keep only rows that are not community-college referendums (flag equal to 0)
# and that have an LEAID match. Rows whose flag is missing are dropped as well.
keep_row <- !is.na(df$Community_college) &
  df$Community_college == 0 &
  !is.na(df$LEAID)
df <- df[keep_row, ]
rm(keep_row)

# Clean type fields.
# Each rule overwrites `type` only on the rows it matches. Rows for which a
# rule's condition is missing (NA) keep their current type; with ifelse() a
# missing levy_parcel turned the type into NA and the row was then silently
# dropped by the final filter.
df$type[df$type %in% "GO Bond"] <- "Bond"

# NOTE(review): levy_parcel is still in its raw form here (it is only
# converted with as.numeric() further down). If it was read as text, `> 0` is
# a string comparison -- confirm that this is intended.
has_levy <- !is.na(df$levy_parcel) & df$levy_parcel > 0
df$type[has_levy] <- "levy_parcel"
rm(has_levy)

# Any row with a bond amount is a bond, whatever the rules above assigned.
df$type[!is.na(df$bond_amount)] <- "Bond"

# That one initiative in Orange County is a levy tax.
df$type[df$type %in% "Initiative"] <- "levy_parcel"

# Keep only the three referendum types used in the analysis.
df <- df[df$type %in% c("Bond", "levy_parcel", "Property Tax"), ]

# Convert the identifier and the date to the types used downstream.
# LEAID is formatted as a zero-padded, 7-character string.
df[["LEAID"]] <- sprintf(fmt = "%07d", df[["LEAID"]])
# Dates are parsed as month/day/four-digit year.
df[["date"]] <- as.Date(df[["date"]], format = "%m/%d/%Y")


###############
## Combine votes for school districts that span multiple counties
###############
  #FYI: school districts that span multiple counties appear in the raw database once per county. Because the analysis is at the referendum level (i.e. school district), we combine the votes of each such referendum across counties into a single row per school district.


# Name of the referendum in the raw dataset, with all spaces removed.
df$measure_name <- gsub(pattern = " ", replacement = "", x = df$measure_name)

# Create a unique ID for each referendum from the school ID, the measure name
# and the date, joined with "_".
df$ref_id <- paste(df$LEAID, df$measure_name, df$date, sep = "_")

# Sum the three vote counts over all rows that share the same ID.
vote_cols <- c("vote_for", "vote_against", "vote_total")
votes <- aggregate(df[vote_cols], by = df["ref_id"], FUN = sum)

# Recreate the main dataset: keep the referendum-level columns, drop repeated
# rows, and attach the summed votes through the ID.
keep_cols <- c("date", "LEAID", "type", "threshold", "bond_amount",
               "levy_parcel", "ref_id")
df <- df[keep_cols]
df <- df[!duplicated(df), , drop = FALSE]
df <- merge(df, votes, by = "ref_id")
rm(votes, vote_cols, keep_cols)

# The ID was only needed for the merge.
df$ref_id <- NULL

# Calendar year, taken from the first four characters of the date.
df$year <- as.numeric(substr(as.character(df$date), 1, 4))



################################################################################
# Get mean yields of GO bonds in CA schools and attach them by year.
# The directory is spelled "Raw Data", exactly as in the path of the main data
# file loaded at the top of the script; the previous "Raw data" spelling only
# resolves on case-insensitive file systems.
mean_yield <- read.csv("Data/Raw Data/mean_bond_yields_CA_schools.csv")

# merge() keeps only years present in both tables, so referendums held in a
# year without a yield record are dropped here.
df <- merge(df, mean_yield, by = "year")
rm(mean_yield)

# NOTE(review): the division by 10000 presumably converts a yield quoted in
# basis points into a fraction -- confirm against the yields file.
df$cost_bond <- (df$yield / 10000) * df$bond_amount


#################################################################################
# Construct the yes-vote share, the pass indicator, and the history of previous referendums
##

# Share of yes votes as a fraction of total votes.
yes_share <- df$vote_for / df$vote_total
df$perc_yes <- yes_share

# A referendum passes when the yes share strictly exceeds its threshold.
df$Pass <- ifelse(yes_share > df$threshold, 1, 0)

# Store the yes share as a percentage from here on.
df$perc_yes <- 100 * yes_share
rm(yes_share)

# Full month name of the referendum date.
df$month <- format(df$date, "%B")

# Election identifier: the first day of the month in which the vote was held.
df$election <- as.Date(format(df$date, "%Y-%m-01"), format = "%Y-%m-%d")

# For every referendum, look up the earlier referendums of the same school
# district and record
#   last_el  : the most recent earlier election month, and
#   last_win : the most recent earlier election month with a passing referendum.
# NA means "none". Using NA instead of a placeholder date avoids confusing a
# real election with the placeholder.
n_ref <- nrow(df)
last_el <- rep(as.Date(NA), n_ref)
last_win <- rep(as.Date(NA), n_ref)

# seq_len() makes the loop a no-op on an empty data set.
for (i in seq_len(n_ref)) {

  # Rows of the same district with a strictly earlier election month.
  earlier <- which(df$election < df$election[i] & df$LEAID == df$LEAID[i])

  if (length(earlier) > 0) {
    last_el[i] <- max(df$election[earlier]) # take the most recent

    # which() drops referendums whose outcome is missing, so they are never
    # counted as wins.
    earlier_wins <- earlier[which(df$Pass[earlier] == 1)]

    if (length(earlier_wins) > 0) {
      last_win[i] <- max(df$election[earlier_wins])
    }
  }
}

# The district's most recent earlier election month saw no passing referendum:
# either the district never won before, or its last win is older than its
# last election.
lost_last_el <- !is.na(last_el) & (is.na(last_win) | last_win != last_el)

# Calendar years elapsed since the most recent earlier election.
tsince_last_el <- df$year - as.numeric(format(last_el, "%Y"))

# Defined failed recently as having had a referendum that did not pass in the
# last cycle of 4 years.
df$failed_recently <- ifelse(lost_last_el & tsince_last_el < 4, 1, 0)

# Remove the helpers; some exist only if the loop body ran.
rm(list = intersect(c("n_ref", "last_el", "last_win", "i", "earlier",
                      "earlier_wins", "lost_last_el", "tsince_last_el"),
                    ls()))
summary(df$failed_recently)
#################
# Define elections: flag referendums whose election month is one of the listed
# November dates.
presidential_months <- as.Date(c("2008-11-01", "2012-11-01",
                                 "2016-11-01", "2020-11-01"))
midterm_months <- as.Date(c("2010-11-01", "2014-11-01",
                            "2018-11-01", "2022-11-01"))

df$presidential_el <- ifelse(df$election %in% presidential_months, 1, 0)
df$midterm_el <- ifelse(df$election %in% midterm_months, 1, 0)
rm(presidential_months, midterm_months)

# 1 for odd calendar years, 0 for even ones.
df$odd_year <- df$year %% 2


# Amounts as numeric. Entries of levy_parcel that are not numbers become NA;
# only the warning raised by that coercion is silenced.
df$bond_amount <- as.numeric(df$bond_amount)
df$levy_parcel <- suppressWarnings(as.numeric(df$levy_parcel))

# Referendum type as a labelled factor, plus a bond indicator.
df$type <- factor(
  x = df$type,
  levels = c("Property Tax", "Bond", "levy_parcel"),
  labels = c("Property Tax", "Bond", "Parcel Levy")
)
df$Bond <- ifelse(df$type == "Bond", 1, 0)

# Indicator for referendums held in 2019 or later.
df$post <- ifelse(df$year >= 2019, 1, 0)

# Election fixed effect: November elections of even years get their own level
# (the election date); every other referendum is grouped by calendar year.
# November is detected from the month number of the date, which does not
# depend on the session locale, unlike a comparison with the month name.
november_even_year <- df$odd_year != 1 & format(df$date, "%m") == "11"
df$election_FE <- as.factor(
  ifelse(november_even_year, as.character(df$election), df$year)
)
rm(november_even_year)

# The month name is not needed any further.
df$month <- NULL
#######
# Cellini et al. definition of outliers: the flag is 1 when the yes share lies
# between 40 and 90 percent (bounds included) and 0 outside that range.
df$tight_election <- ifelse(df$perc_yes >= 40 & df$perc_yes <= 90, 1, 0)

# Search for school districts with a unique referendum: count the rows per
# LEAID and flag the districts that appear exactly once.
leaid_counts <- table(df$LEAID)
repeat_leaids <- names(leaid_counts)[leaid_counts > 1]

df$one_election <- ifelse(df$LEAID %in% repeat_leaids, 0, 1)
rm(leaid_counts, repeat_leaids)

